{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 缺失值处理\n",
    "### 缺失值查看"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 5 entries, 0 to 4\n",
      "Data columns (total 4 columns):\n",
      "编号      4 non-null object\n",
      "年龄      4 non-null float64\n",
      "性别      3 non-null object\n",
      "注册时间    4 non-null datetime64[ns]\n",
      "dtypes: datetime64[ns](1), float64(1), object(2)\n",
      "memory usage: 240.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"..\\Data\\Chapter05.xlsx\")\n",
    "df.head(20).info()#head()默认只显示前5条数据\n",
    "#df.info()#info()方法返回各个字段属性及每一列缺失数据的情况"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 缺失值删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A3</td>\n",
       "      <td>47.0</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A4</td>\n",
       "      <td>41.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号    年龄   性别       注册时间\n",
       "0  A1  54.0    男 2018-08-08\n",
       "1  A2  16.0  NaN 2018-08-09\n",
       "3  A3  47.0    女 2018-08-10\n",
       "4  A4  41.0    男 2018-08-11"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"..\\Data\\Chapter05.xlsx\")\n",
    "df.dropna() #dropna()删除缺失值的行\n",
    "df.dropna(how = \"all\")#删除所有列为空的行"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 缺失值填充"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>编号</th>\n",
       "      <th>年龄</th>\n",
       "      <th>性别</th>\n",
       "      <th>注册时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>54.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>16.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A3</td>\n",
       "      <td>30.0</td>\n",
       "      <td>女</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A4</td>\n",
       "      <td>41.0</td>\n",
       "      <td>男</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   编号    年龄 性别       注册时间\n",
       "0  A1  54.0  男 2018-08-08\n",
       "1  A2  16.0  男 2018-08-09\n",
       "2  A3  30.0  女 2018-08-10\n",
       "3  A4  41.0  男 2018-08-11"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"..\\Data\\Chapter05.xlsx\",sheet_name=1)\n",
    "df.fillna(0)#fillna将缺失值填充为0\n",
    "df.fillna({\"性别\":\"男\",\"年龄\":30})#分别对性别和年龄填充\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 重复数据处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>A1</td>\n",
       "      <td>张通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "0   A1   张通    101 2018-08-08\n",
       "1   A2   李谷    102 2018-08-09\n",
       "3   A3   孙凤    103 2018-08-10\n",
       "5   A5   赵恒    104 2018-08-11"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"..\\Data\\Chapter05.xlsx\",sheet_name=2)\n",
    "df.drop_duplicates() #删除重复的列\n",
    "df.drop_duplicates(subset = \"唯一识别码\") #指定判断的列\n",
    "df.drop_duplicates(subset = [\"客户姓名\",\"唯一识别码\"])\n",
    "df.drop_duplicates(subset = [\"客户姓名\",\"唯一识别码\"],keep = \"last\") #keep参数（first,last）设置保留那个值\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 异常值的检测与处理"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "对于异常值一般有以下几种处理方式：\n",
    "- 最常用的处理方式就是删除。\n",
    "- 把异常值当作缺失值来填充。\n",
    "- 把异常值当作特殊情况，研究异常值出现的原因"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据类型转换"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据类型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "类型 | 说明\n",
    "---|---\n",
    "int | 整型数，即整数\n",
    "flat | 浮点数，即含有小数点的数\n",
    "object | Python对象类型，用O表示\n",
    "string_ | 字符串类型，经常用S表示，S10表示长度为10的字符串\n",
    "unicode_ | 谷歌程度的unicode类型，跟字符串的定义方式一样\n",
    "datatime64[ns] | 表示时间格式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 6 entries, 0 to 5\n",
      "Data columns (total 4 columns):\n",
      "订单编号     6 non-null object\n",
      "客户姓名     6 non-null object\n",
      "唯一识别码    6 non-null int64\n",
      "成交时间     6 non-null datetime64[ns]\n",
      "dtypes: datetime64[ns](1), int64(1), object(2)\n",
      "memory usage: 272.0+ bytes\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "dtype('int64')"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"..\\Data\\Chapter05.xlsx\",sheet_name = 2)\n",
    "df.info() #info( )获取每一列的数据类型\n",
    "df[\"订单编号\"].dtype # 查看订单编号这一列的数据类型\n",
    "df[\"唯一识别码\"].dtype # 查看唯一识别码这一列的数据类型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 类型转换"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    101.0\n",
       "1    102.0\n",
       "2    103.0\n",
       "3    103.0\n",
       "4    104.0\n",
       "5    104.0\n",
       "Name: 唯一识别码, dtype: float64"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"..\\Data\\Chapter05.xlsx\",sheet_name = 2)\n",
    "df[\"唯一识别码\"].dtype #查看类型\n",
    "df[\"唯一识别码\"].astype(\"float64\")#将唯一识别码冲int类型转为float类型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 索引设置"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 为无索引表添加索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>订单编号</th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A1</td>\n",
       "      <td>张通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  订单编号 客户姓名  唯一识别码       成交时间\n",
       "1   A1   张通    101 2018-08-08\n",
       "2   A2   李谷    102 2018-08-09\n",
       "3   A3   孙凤    103 2018-08-10\n",
       "4   A4   赵恒    104 2018-08-11\n",
       "5   A5   赵恒    104 2018-08-11"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"..\\Data\\Chapter05.xlsx\",sheet_name = 3,header= None)\n",
    "df.columns = [\"订单编号\",\"客户姓名\",\"唯一识别码\",\"成交时间\"]#header需要设置为None，否则会覆盖第一行数据\n",
    "df.index = [1,2,3,4,5]\n",
    "df\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 重新设置索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>订单编号</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>A1</th>\n",
       "      <td>张通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A2</th>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A3</th>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A3</th>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A4</th>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A5</th>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     客户姓名  唯一识别码       成交时间\n",
       "订单编号                       \n",
       "A1     张通    101 2018-08-08\n",
       "A2     李谷    102 2018-08-09\n",
       "A3     孙凤    103 2018-08-10\n",
       "A3     孙凤    103 2018-08-10\n",
       "A4     赵恒    104 2018-08-11\n",
       "A5     赵恒    104 2018-08-11"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"..\\Data\\Chapter05.xlsx\",sheet_name = 2)\n",
    "df.set_index(\"订单编号\") #se_index()方法重新设置索引列"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 重命名索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>新订单编号</th>\n",
       "      <th>新客户姓名</th>\n",
       "      <th>唯一识别码</th>\n",
       "      <th>成交时间</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>一</th>\n",
       "      <td>A1</td>\n",
       "      <td>张通</td>\n",
       "      <td>101</td>\n",
       "      <td>2018-08-08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>二</th>\n",
       "      <td>A2</td>\n",
       "      <td>李谷</td>\n",
       "      <td>102</td>\n",
       "      <td>2018-08-09</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>三</th>\n",
       "      <td>A3</td>\n",
       "      <td>孙凤</td>\n",
       "      <td>103</td>\n",
       "      <td>2018-08-10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>四</th>\n",
       "      <td>A4</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>A5</td>\n",
       "      <td>赵恒</td>\n",
       "      <td>104</td>\n",
       "      <td>2018-08-12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  新订单编号 新客户姓名  唯一识别码       成交时间\n",
       "一    A1    张通    101 2018-08-08\n",
       "二    A2    李谷    102 2018-08-09\n",
       "三    A3    孙凤    103 2018-08-10\n",
       "四    A4    赵恒    104 2018-08-11\n",
       "5    A5    赵恒    104 2018-08-12"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"..\\Data\\Chapter05.xlsx\",sheet_name = 4)\n",
    "df.index = [1,2,3,4,5] #添加索引\n",
    "df.rename(columns={\"订单编号\":\"新订单编号\",\"客户姓名\":\"新客户姓名\"}) #重命名列索引\n",
    "df.rename(index = {1:\"一\",2:\"二\",3:\"三\"}) #重命名行索引\n",
    "df.rename(columns={\"订单编号\":\"新订单编号\",\"客户姓名\":\"新客户姓名\"},index = {1:\"一\",2:\"二\",3:\"三\",4:'四'})#同时重命名列和行索引"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 重置索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>level_0</th>\n",
       "      <th>level_1</th>\n",
       "      <th>C1</th>\n",
       "      <th>C2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Z1</td>\n",
       "      <td>Z2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A</td>\n",
       "      <td>a</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>b</td>\n",
       "      <td>3.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>B</td>\n",
       "      <td>a</td>\n",
       "      <td>5.0</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>b</td>\n",
       "      <td>7.0</td>\n",
       "      <td>8.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  level_0 level_1   C1   C2\n",
       "0      Z1      Z2  NaN  NaN\n",
       "1       A       a  1.0  2.0\n",
       "2     NaN       b  3.0  4.0\n",
       "3       B       a  5.0  6.0\n",
       "4     NaN       b  7.0  8.0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "df = pd.read_excel(r\"..\\Data\\Chapter05.xlsx\",sheet_name=5)\n",
    "df.reset_index()\n",
    "#详见第10章"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "第5章 数据预处理",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
